IMPORT

library(ggplot2)
library(tidyverse)
library(dplyr)
library(harmony)
library(Seurat)
# Import seurat object
seur.human <- readRDS("~/Projects/20220809_Thymic-iNKT-CrossSpecies/data/raw_data/human_data/filtered_seurat_Harmony_07-22-22.RDS")

# Sanity check
print(seur.human) # 79,801 cells (it's the whole seurat object)
## An object of class Seurat 
## 34778 features across 79801 samples within 2 assays 
## Active assay: SCT (17389 features, 3000 variable features)
##  1 other assay present: RNA
##  3 dimensional reductions calculated: pca, umap, harmony
# Quick visualization
DimPlot(seur.human)

DEFINE FUNCTIONS

Preprocess <- function(seuratobject, SCTsplit=T, res=0.5, harmony_vars= c("Method", "Batch"), printComputation=T){
  
    # Extract the counts data (from RNA assay) and metadata
  counts <- GetAssayData(object = seuratobject, slot = "counts", assay="RNA")
  metadata <- seuratobject@meta.data
  metadata[, c("nCount_SCT", "nFeature_SCT", "SCT_snn_res.0.7", "seurat_clusters", "SCT_snn_res.0.9", "SCT_snn_res.1")] <- NULL # remove some cluster columns to avoid any confusion
  colnames(metadata)[14] <- "clusters_bigUMAP" # rename "new_clusters" column
  # Create new seurat object with only the counts and the "cleaned" metadata
  initseurat <- CreateSeuratObject(counts = counts, meta.data = metadata, min.cells=20) # keep only genes expressed in at least 20 cells
  
  cat("Initial seurat object:\n")
  print(initseurat) # this is for sanity check
  
  if(SCTsplit==T){
    cat("\n1-Perform SCTransform on split data\n")
    # Normalize with SCTransform by splitting object (per batch)
    seur.list <- SplitObject(initseurat, split.by = "orig.ident")
    for(i in 1:length(seur.list)){
      cat("...SCTransform on", names(seur.list)[i], "\n")
      seur.list[[i]] <-  SCTransform(seur.list[[i]], vars.to.regress = 'percent.mt', verbose=printComputation)
    }
    # seur.list <- lapply(X = seur.list, FUN = SCTransform, vars.to.regress="percent.mt", method="glmGamPois") # not working
    sct.hvg  <- SelectIntegrationFeatures(object.list = seur.list, nfeatures = 3000)
    seur.SCT <- merge(seur.list[[1]], y = unlist(seur.list[-1], use.names=FALSE), merge.data = TRUE)
    VariableFeatures(seur.SCT) <- sct.hvg
  }
  
  else if(SCTsplit==F){
    cat("\n1-Perform SCTransform on merged data")
    # Normalize with SCTransform (without splitting object)
    seur.SCT <- SCTransform(initseurat,
                            assay="RNA",
                            new.assay.name="SCT",
                            vars.to.regress = c('percent.mt'),
                            verbose=printComputation)
  }
  
  # Run PCA (don't scale data when using SCTransform)
  cat("\n2-Run PCA pre-integration")
  seur.SCT <- RunPCA(object = seur.SCT, assay = "SCT", npcs = 50, verbose=printComputation)
  p1 <- DimPlot(seur.SCT, reduction = "pca", group.by = "orig.ident") + ggtitle("PCA pre-integration")
  ElbowPlot(seur.SCT)
  cat("\nSeurat object pre-integration:\n")
  print(seur.SCT)
  
  # Run Harmony for integration
  cat("\n3-Run Harmony\n")
  seur.harmony <- RunHarmony(object = seur.SCT,
                             assay.use = "SCT",
                             reduction = "pca",
                             dims.use = 1:50,
                             group.by.vars = harmony_vars,
                             plot_convergence = printComputation)
  cat("\nSeurat object post-integration:\n")
  print(seur.harmony)
  cat("\n4-Run UMAP & clustering\n")
  seur.harmony <- RunUMAP(object = seur.harmony, dims = 1:15, assay = "SCT", reduction = "harmony", verbose=printComputation)
  seur.harmony <- FindNeighbors(object = seur.harmony, dims = 1:15, assay = "SCT", reduction = "harmony", verbose=printComputation) %>%
    FindClusters(resolution = res, verbose=printComputation)
  
  # Show PCA after integration
  p2 <- DimPlot(seur.harmony, reduction = "harmony", group.by = "orig.ident") + ggtitle("PCA post-integration")
  
  print(p1 | p2)
  
  return(seur.harmony)
}

THYMIC MAIT CELLS

1. Preprocessing

# Isolate MAIT cells
MAIT_Thymus <- subset(seur.human, ident = "MAIT_Thymus") # 4689 cells

# Quick QC
FeatureScatter(MAIT_Thymus, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by="Batch")

FeatureScatter(MAIT_Thymus, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by="Batch")

# PREPROCESS
MAIT_SCTsplit <- Preprocess(seuratobject = MAIT_Thymus, SCTsplit = T, res = 0.3, harmony_vars = c("Sex", "Method"), printComputation = F)
## Initial seurat object:
## An object of class Seurat 
## 10772 features across 4689 samples within 1 assay 
## Active assay: RNA (10772 features, 0 variable features)
## 
## 1-Perform SCTransform on split data
## ...SCTransform on MAIT_1_Thymus 
## ...SCTransform on MAIT_2_Thymus 
## ...SCTransform on MAIT_3_Thymus 
## 
## 2-Run PCA pre-integration
## Seurat object pre-integration:
## An object of class Seurat 
## 21544 features across 4689 samples within 2 assays 
## Active assay: SCT (10772 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## 3-Run Harmony
## 
## Seurat object post-integration:
## An object of class Seurat 
## 21544 features across 4689 samples within 2 assays 
## Active assay: SCT (10772 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, harmony
## 
## 4-Run UMAP & clustering

MAIT_SCTmerged <- Preprocess(seuratobject = MAIT_Thymus, SCTsplit = F, res = 0.3, harmony_vars = c("Sex", "Method"), printComputation = F)
## Initial seurat object:
## An object of class Seurat 
## 10772 features across 4689 samples within 1 assay 
## Active assay: RNA (10772 features, 0 variable features)
## 
## 1-Perform SCTransform on merged data
## 2-Run PCA pre-integration
## Seurat object pre-integration:
## An object of class Seurat 
## 21544 features across 4689 samples within 2 assays 
## Active assay: SCT (10772 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## 3-Run Harmony
## 
## Seurat object post-integration:
## An object of class Seurat 
## 21544 features across 4689 samples within 2 assays 
## Active assay: SCT (10772 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, harmony
## 
## 4-Run UMAP & clustering

2. Visualize

# Sanity checks
# DimPlot(seur.harmony, dims = 1:2, group.by = "orig.ident")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Sex")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Donor")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Batch")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Method")

# Look at clusters
DimPlot(MAIT_SCTsplit, dims = 1:2, reduction = "umap", group.by = "SCT_snn_res.0.3", label = T) + ggtitle("SCT on split data")

DimPlot(MAIT_SCTmerged, dims = 1:2, reduction = "umap", group.by = "SCT_snn_res.0.3", label = T) + ggtitle("SCT on merged data")

THYMIC NKT CELLS

1. Preprocessing

# Isolate NKT cells
NKT_Thymus <- subset(seur.human, ident = "NKT_Thymus") # 2551 cells

# Quick QC
FeatureScatter(NKT_Thymus, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by="Batch")

FeatureScatter(NKT_Thymus, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by="Batch")

# PREPROCESS
NKT_SCTsplit <- Preprocess(seuratobject = NKT_Thymus, SCTsplit = T, res = 0.4, harmony_vars = c("Sex", "Method"), printComputation = F)
## Initial seurat object:
## An object of class Seurat 
## 10005 features across 2551 samples within 1 assay 
## Active assay: RNA (10005 features, 0 variable features)
## 
## 1-Perform SCTransform on split data
## ...SCTransform on NKT_1_Thymus 
## ...SCTransform on NKT_2_Thymus 
## ...SCTransform on NKT_3_Thymus 
## 
## 2-Run PCA pre-integration
## Seurat object pre-integration:
## An object of class Seurat 
## 20010 features across 2551 samples within 2 assays 
## Active assay: SCT (10005 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## 3-Run Harmony
## 
## Seurat object post-integration:
## An object of class Seurat 
## 20010 features across 2551 samples within 2 assays 
## Active assay: SCT (10005 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, harmony
## 
## 4-Run UMAP & clustering

NKT_SCTmerged <- Preprocess(seuratobject = NKT_Thymus, SCTsplit = F, res = 0.4, harmony_vars = c("Sex", "Method"), printComputation = F)
## Initial seurat object:
## An object of class Seurat 
## 10005 features across 2551 samples within 1 assay 
## Active assay: RNA (10005 features, 0 variable features)
## 
## 1-Perform SCTransform on merged data
## 2-Run PCA pre-integration
## Seurat object pre-integration:
## An object of class Seurat 
## 20010 features across 2551 samples within 2 assays 
## Active assay: SCT (10005 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## 3-Run Harmony
## 
## Seurat object post-integration:
## An object of class Seurat 
## 20010 features across 2551 samples within 2 assays 
## Active assay: SCT (10005 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, harmony
## 
## 4-Run UMAP & clustering

2. Visualize

# Sanity checks
# DimPlot(seur.harmony, dims = 1:2, group.by = "orig.ident")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Sex")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Donor")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Batch")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Method")

# Look at clusters
DimPlot(NKT_SCTsplit, dims = 1:2, reduction = "umap", group.by = "SCT_snn_res.0.4", label = T) + ggtitle("SCT on split data") |
  DimPlot(NKT_SCTmerged, dims = 1:2, reduction = "umap", group.by = "SCT_snn_res.0.4", label = T) + ggtitle("SCT on merged data")

THYMIC GD T CELLS

1. Preprocessing

# Isolate gdT cells
gdT_Thymus <- subset(seur.human, ident = "GD_Thymus") # 2981 cells

# Quick QC
FeatureScatter(gdT_Thymus, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by="Batch")

FeatureScatter(gdT_Thymus, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by="Batch")

# PREPROCESS
gdT_SCTsplit <- Preprocess(seuratobject = gdT_Thymus, SCTsplit = T, res = 0.4, harmony_vars = c("Sex", "Method"), printComputation = F)
## Initial seurat object:
## An object of class Seurat 
## 11860 features across 2981 samples within 1 assay 
## Active assay: RNA (11860 features, 0 variable features)
## 
## 1-Perform SCTransform on split data
## ...SCTransform on GD_1_Thymus 
## ...SCTransform on GD_2_Thymus 
## 
## 2-Run PCA pre-integration
## Seurat object pre-integration:
## An object of class Seurat 
## 23720 features across 2981 samples within 2 assays 
## Active assay: SCT (11860 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## 3-Run Harmony
## 
## Seurat object post-integration:
## An object of class Seurat 
## 23720 features across 2981 samples within 2 assays 
## Active assay: SCT (11860 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, harmony
## 
## 4-Run UMAP & clustering

gdT_SCTmerged <- Preprocess(seuratobject = gdT_Thymus, SCTsplit = F, res = 0.4, harmony_vars = c("Sex", "Method"), printComputation = F)
## Initial seurat object:
## An object of class Seurat 
## 11860 features across 2981 samples within 1 assay 
## Active assay: RNA (11860 features, 0 variable features)
## 
## 1-Perform SCTransform on merged data
## 2-Run PCA pre-integration
## Seurat object pre-integration:
## An object of class Seurat 
## 23720 features across 2981 samples within 2 assays 
## Active assay: SCT (11860 features, 3000 variable features)
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
## 
## 3-Run Harmony
## 
## Seurat object post-integration:
## An object of class Seurat 
## 23720 features across 2981 samples within 2 assays 
## Active assay: SCT (11860 features, 3000 variable features)
##  1 other assay present: RNA
##  2 dimensional reductions calculated: pca, harmony
## 
## 4-Run UMAP & clustering

2. Visualize

# Sanity checks
# DimPlot(seur.harmony, dims = 1:2, group.by = "orig.ident")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Sex")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Donor")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Batch")
# DimPlot(seur.harmony, dims = 1:2, group.by = "Method")

# Look at clusters
DimPlot(gdT_SCTsplit, dims = 1:2, reduction = "umap", group.by = "SCT_snn_res.0.4", label = T) + ggtitle("SCT on split data") |
  DimPlot(gdT_SCTmerged, dims = 1:2, reduction = "umap", group.by = "SCT_snn_res.0.4", label = T) + ggtitle("SCT on merged data")